The gene trees were constructed using ETE toolkit:
Preparatory step: installing Miniconda, following the instructions from http://etetoolkit.org/download/
The gene trees were estimated using RAxML:
ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/uce-3.fasta -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/uce-3-tree
Automatically generate the corresponding command for each of the 230 alignment files in the directory:
cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA
ls
Copy the output to a text file:
# Get the 1st part of the command
locus_table <- read.table("Set1-individual-loci.txt")
n <- length(unlist(locus_table))
inputfile <- vector()
for(i in locus_table) {
inputfile <- append(inputfile, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}
# Length check
length(inputfile) - n
# Remove file endings from the locus names
names_and_endings <- as.character(as.vector(as.matrix(locus_table)))
locus_names <- vector()
for(i in names_and_endings) {
locus_names <- append(locus_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}
# Get the 2nd part of the command
outputfile <- vector()
for(i in locus_names) {
outputfile <- append(outputfile, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep=""))
}
# Length check
length(outputfile) - n
# Get the entire command
commands <- paste(inputfile, outputfile, "-tree &&", sep="")
# Length check
length(commands) - n
# Uncomment to print to file:
# write(commands, "gene-tree-analysis.sh")
#!/bin/bash was then inserted into the first line of the file to convert it into a shell script. The script was run as follows:
chmod 755 /Users/David/Grive/Alfaro_Lab/SortaDate/Locus_analysis/gene-tree-analysis.sh
/Users/David/Grive/Alfaro_Lab/SortaDate/Locus_analysis/gene-tree-analysis.sh
Write a new script that will copy the trees from the nested subdirectories to the same directory where the alignment files are located:
# 1st part of the command
copyfiles <- vector()
for(i in locus_names) {
copyfiles <- append(copyfiles, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep = ""))
}
# 2nd part of the command
subdirectories <- vector()
for(i in locus_names) {
subdirectories <- append(subdirectories, paste("-tree/clustalo_default-none-none-raxml_default/", i, sep = ""))
}
# Get the entire command
copying <- paste(copyfiles, subdirectories, ".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ &&", sep="")
# Uncomment to print to file:
# write(copying, "copy-trees.sh")
Write a third script to rename the tree files so that they have the .tre file ending, which SortaDate looks for while searching its target directory:
# 1st part of the command
oldname <- vector()
for(i in locus_names) {
oldname <- append(oldname, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}
# 2nd part of the command
newname <- vector()
for(i in locus_names) {
newname <- append(newname, paste(".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}
# Get the entire command:
renametrees <- paste(oldname, newname, ".tre &&", sep = "")
# Uncomment to print to file:
write(renametrees, "rename-trees.sh")
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ --flend .tre --outf Locus_analysis/var-uces --outg alepisaurus_ferox,ceratoscopelus_warmingii
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Locus_analysis/bp-uces
python src/combine_results.py Locus_analysis/var-uces Locus_analysis/bp-uces --outf Locus_analysis/comb-uces
In order of descending priority: bipartition support, root-to-tip variance, tree length
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 3,1,2 --outf Locus_analysis/gg-uces-312
name root-to-tip_var treelength bipartition
uce-1184.tre 0.00510364 5.59114 0.491379310345
uce-383.tre 0.0094896 4.68126 0.465517241379
uce-1317.tre 0.0200846 8.06488 0.465517241379
In order of descending priority: bipartition support, tree length, root-to-tip variance
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 3,2,1 --outf Locus_analysis/gg-uces-321
name root-to-tip_var treelength bipartition
uce-1184.tre 0.00510364 5.59114 0.491379310345
uce-383.tre 0.0094896 4.68126 0.465517241379
uce-1317.tre 0.0200846 8.06488 0.465517241379
In order of descending priority: root-to-tip variance, tree length, bipartition support
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 1,2,3 --outf Locus_analysis/gg-uces-123
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-446.tre 0.000230387 0.897702 0.26724137931
uce-855.tre 0.000309407 1.03296 0.301724137931
In order of descending priority: root-to-tip variance, bipartition support, tree length
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 1,3,2 --outf Locus_analysis/gg-uces-132
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-446.tre 0.000230387 0.897702 0.26724137931
uce-855.tre 0.000309407 1.03296 0.301724137931
In order of descending priority: tree length, root-to-tip variance, bipartition support
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 2,1,3 --outf Locus_analysis/gg-uces-213
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-737.tre 0.000743068 0.814482 0.137931034483
uce-446.tre 0.000230387 0.897702 0.26724137931
In order of descending priority: tree length, bipartition support, root-to-tip variance
python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 2,3,1 --outf Locus_analysis/gg-uces-231
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-737.tre 0.000743068 0.814482 0.137931034483
uce-446.tre 0.000230387 0.897702 0.26724137931
Note that the largest, slowest-evolving partition (“ccf55a6ee6d62f840a124bcc0c98ecf5”; 132 kb) was excluded from the first round of analyses for computational reasons. Attempts to analyze it in RAxML after the remaining 31 analyses finished up were unsuccessful.
# Get the 1st part of the command
kmeans_table <- read.table("Set2-kmeans-partitions.txt")
n2 <- length(unlist(kmeans_table))
first <- vector()
for(i in kmeans_table) {
first <- append(first, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep = ""))
}
# Length check
length(first) - n2
# Remove file endings from the locus names
no_endings <- as.character(as.vector(as.matrix(kmeans_table)))
kmeans_names <- vector()
for(i in no_endings) {
kmeans_names <- append(kmeans_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}
kmeans_names
# Get the 2nd part of the command
second <- vector()
for(i in kmeans_names) {
second <- append(second, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep=""))
}
second
# Length check
length(second) - n2
# Get the entire command
kmeansscript <- paste(first, second, "-tree &&", sep="")
# Length check
length(kmeansscript) - n2
# Uncomment to print to file:
# write(kmeansscript, "kmeans-analysis.sh")
Copy the trees into the directory containing the alignments:
# 1st part of the command
copytrees <- vector()
for(i in kmeans_names) {
copytrees <- append(copytrees, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep = ""))
}
# 2nd part of the command
locations <- vector()
for(i in kmeans_names) {
locations <- append(locations, paste("-tree/clustalo_default-none-none-raxml_default/", i, sep = ""))
}
# Get the entire command
finalstep <- paste(copytrees, locations, ".phy.fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ &&", sep="")
# Uncomment to print to file:
# write(finalstep, "copy-kmeans-trees.sh")
Change the tree names so that they correspond to the partition names:
# 1st part of the command
oldtreename <- vector()
for(i in kmeans_names) {
oldtreename <- append(oldtreename, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}
# 2nd part of the command
newtreename <- vector()
for(i in kmeans_names) {
newtreename <- append(newtreename, paste(".phy.fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}
# Get the entire command:
changetreenames <- paste(oldtreename, newtreename, ".tre &&", sep = "")
# Uncomment to print to file:
# write(changetreenames, "rename-kmeans-trees.sh")
Finally, rename the partitions:
# 1st part of the command
oldpartition <- vector()
for(i in kmeans_names) {
oldpartition <- append(oldpartition, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}
# 2nd part of the command
newpartition <- vector()
for(i in kmeans_names) {
newpartition <- append(newpartition, paste(".phy.fasta /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}
# Get the entire command
renamepartitions <- paste(oldpartition, newpartition, ".fasta &&", sep = "")
# Uncomment to print to file:
# write(renamepartitions, "rename-partitions.sh")
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ --flend .tre --outf var-kmeans --outg alepisaurus_ferox,ceratoscopelus_warmingii
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf bp-kmeans
python src/combine_results.py var-kmeans bp-kmeans --outf comb-kmeans
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
8be7c94dcf2d71970c663f6710af40d7.tre 0.0878182 19.8649 0.560344827586
f495e4e0f2f9bbbf091f778067b062f4.tre 0.0230499 11.0301 0.508620689655
0061a6137d1029978876fe13239f57bc.tre 0.0205424 11.2184 0.5
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
8be7c94dcf2d71970c663f6710af40d7.tre 0.0878182 19.8649 0.560344827586
f495e4e0f2f9bbbf091f778067b062f4.tre 0.0230499 11.0301 0.508620689655
0061a6137d1029978876fe13239f57bc.tre 0.0205424 11.2184 0.5
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862
A bash script was written to automate the following actions:
/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA for each locusSplit the alignments for each locus into individual sequences, and these into 50-bp chunks. This step was performed using a custom Python script obtained from http://www.reddit.com/r/bioinformatics/comments/1u8yc7/looking_for_a_script_that_will_split_dna/ceg8rav/?st=j0tbjfco&sh=1e300055.
'''
Splits all sequences within a multi-fasta file into chunks of a specified size.
Fasta header information is retained with each split sequence - its position in
the original is appended to the id. Single-line and multi-line fasta files are
supported. Prints to stout, so pipe to a file to store the result.
Usage:
python splitter.py <filename> <chunksize>
python splitter.py myfile.fa 100
'''
from __future__ import print_function
from sys import argv, version
if version[0] == '2':
from itertools import izip_longest as zl
else:
from itertools import zip_longest as zl
chunksize = int(argv[2])
def writeseq(header, seq):
for i, chunk in enumerate(zl(*[iter(seq)]*chunksize, fillvalue='')):
print(header + '_{}bp'.format(i*chunksize))
print(''.join(chunk))
with open(argv[1]) as f:
header, seq = f.readline().rstrip(), ''
for l in f:
if l[0] != '>':
seq += l.rstrip()
else:
writeseq(header, seq)
header, seq = l.rstrip(), ''
writeseq(header, seq)For each locus, join the individual sequences chunk-wise (i.e., make a single fasta file for all taxa and sites 0 to 50, another one for all taxa and sites 51 to 100, etc.):
library(dplyr)
# Alternating rows (name, sequence, name, sequence) go to two different columns, so that
# each sequence is correctly assigned to its respective taxon:
split_seqs <- read.table("split.txt")
odd <- as.vector(split_seqs[seq(1, nrow(split_seqs), 2), ])
even <- as.vector(split_seqs[seq(2, nrow(split_seqs), 2), ])
odd_name <- "taxon"
even_name <- "sequence"
split_seqs_new <- data.frame(odd, even)
names(split_seqs_new) <- c(odd_name, even_name)
# Determine how long the locus is (i.e., how many 50-bp chunks it has been split into).
# This can be done by counting the number of occurrences of a single taxon name.
# In principle, any name could be used, but since not all of the UCEs include
# all of the taxa, it is advisable to choose a taxon common to all the loci.
n <- length(unique(grep("chaetodon_kleinii", split_seqs_new[,1], value = TRUE)))
# Create a vector of strings that can filter taxon names according to the base pair range
# tag attached to their end
chunks <- vector()
for(i in 0:(n-1)) {
chunks <- append(chunks, paste("_", i*50, "bp", sep = ""))
}
# Create a list of data frames. Each element of the list represents a base pair range
# and consists of a data frame containing both the "taxon" and "sequence" columns of
# split_seqs_new
partition <- list()
for(i in 1:length(chunks)) {
partition[[i]] <- data.frame(filter(split_seqs_new,
grepl(as.character(chunks[i]), taxon)))
}
# Create a matrix whose columns represents individual chunks (i.e., base pair ranges)
# and whose rows have the structure of the original split_seqs data frame -- i.e., name,
# sequence, name, sequence:
chunkmatrix <- matrix(ncol = length(partition),
nrow = 2*(nrow(split_seqs_new)/length(partition)))
for(i in 1:length(partition)) {
for(j in 1:nrow(partition[[i]])) {
chunkmatrix[(2*j - 1), i] <- as.character(partition[[i]][j, "taxon"])
chunkmatrix[2*j, i] <- as.character(partition[[i]][j, "sequence"])
}
}
# Print the resulting fasta files!
for(i in 1:ncol(chunkmatrix)) {
write(chunkmatrix[,i], paste("chunk_", i, ".fasta", sep = ""))
}Add a locus-indicating prefix to all the chunks of a given UCE:
find *.fasta -maxdepth 0 ! -path . -exec mv {} uce-1005_{} \;Copy the resulting fasta files into a single directory.
The contents of the directory were then summarized as follows:
cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/
ls > /Users/David/Grive/Alfaro_Lab/SortaDate/Set3-50bp-chunks.txt
Now, change the taxon names in the chunk FASTA files so that they correspond to the names in the reference tree. First, create a file with the names of all the chunk files:
cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/
ls *.fasta > /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/chunklist.txt
a <- read.table("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/chunklist.txt")
x <- vector()
for(i in 1:nrow(a)) {
x <- append(x, paste("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", a[i,], sep = ""))
}
for(i in 1:length(x)) {
c <- read.table(print(x[i]), stringsAsFactors = FALSE)
d <- vector()
for(j in 1:(nrow(c)/2)) {
d[j] <- as.character(c[(2*j-1),])
}
e <- vector()
for(j in 1:length(d)) {
e[j] <- paste(sapply(strsplit(d[j], split="_", fixed=TRUE), function(x) (x[1])),
"_",
sapply(strsplit(d[j], split="_", fixed=TRUE), function(x) (x[2])),
sep = "")
}
for(j in 1:(nrow(c)/2)) {
c[(2*j-1),] <- e[j]
}
write(as.matrix(c), print(x[i]), ncolumns=1)
}
A script was generated to analyze all of the alignment in the directory using RAxML:
chunk_table <- read.table("Set3-50bp-chunks.txt")
newfolders <- vector()
for(i in chunk_table) {
newfolders <- append(newfolders, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", i, sep=""))
}
with_endings <- as.character(as.vector(as.matrix(chunk_table)))
chunk_names <- vector()
for(i in with_endings) {
chunk_names <- append(chunk_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}
tree_location <- vector()
for(i in chunk_names) {
tree_location <- append(tree_location, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep=""))
}
analyzechunks <- paste(newfolders, tree_location, sep="")
write(analyzechunks, "chunk-analysis.sh")
The resulting tree files were then copied into the directory containing the alignments:
copyfrom1 <- vector()
for(i in chunk_names) {
copyfrom1 <- append(copyfrom1, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep = ""))
}
copyfrom2 <- vector()
for(i in chunk_names) {
copyfrom2 <- append(copyfrom2, paste("/clustalo_default-none-none-raxml_default/", i, sep = ""))
}
copyto <- paste(copyfrom1, copyfrom2, ".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", sep="")
write(copyto, "copy-chunk-trees.sh")
Rename the trees:
brew install rename
rename -S .fasta.final_tree.nw .tre *.fasta.final_tree.nw
Running ls *.fasta | wc -l and ls *.nw | wc -l in the directory showed that out of 1826 chunk FASTA files, no more than 1070 had tree files associated with them.
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/ --flend .tre --outf Chunk_analysis/var-chunks --outg alepisaurus_ferox,ceratoscopelus_warmingii
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk_analysis/bp-chunks
python src/combine_results.py Chunk_analysis/var-chunks Chunk_analysis/bp-chunks --outf Chunk_analysis/comb-chunks
Calculate the Robinson-Foulds distances of the individual chunk trees from the reference tree using the Python script below:
import os, uuid
from ete3 import Tree
t2 = Tree("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre")
for file in os.listdir("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/"):
if file.endswith(".tre"):
t1 = Tree(file)
try:
rf = t1.robinson_foulds(t2)
print str(file), (rf[0])
except:
pass
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 8.80081e-05
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
uce-323_chunk_2.tre 1.16412 48.0104 0.224137931034
## [1] "The Robinson-Foulds distances of the three bets chunks from the reference tree are as follows:"
## chunk rf
## 321 uce-157_chunk_7.tre 164
## chunk rf
## 219 uce-126_chunk_3.tre 166
## chunk rf
## 443 uce-323_chunk_2.tre 168
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 321 uce-157_chunk_7.tre 164
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
uce-323_chunk_2.tre 1.16412 48.0104 0.224137931034
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-1243_chunk_4.tre 1.05801e-11 0.0215722 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
uce-737_chunk_4.tre 2.16826e-11 0.0215345 0.0
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-1243_chunk_4.tre 1.05801e-11 0.0215722 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
uce-737_chunk_4.tre 2.16826e-11 0.0215345 0.0
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-855_chunk_6.tre 3.60737e-11 8.80081e-05 0.0
uce-413_chunk_4.tre 3.51022e-06 0.0205344 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-855_chunk_6.tre 3.60737e-11 8.80081e-05 0.0
uce-413_chunk_4.tre 3.51022e-06 0.0205344 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
Copy all the tree files to a new directory:
cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa
mkdir Chunks_75
cp Chunks/*.tre Chunks_75Collapse all the nodes with SH-like support values of less than 75% using the following Python script:
import os, uuid
from ete3 import Tree
for file in os.listdir("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks"):
if file.endswith(".tre"):
outname = "/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/" + str(file)
t = Tree(file, format=0)
print t.get_ascii(attributes=['support', 'name'])
for node in t.get_descendants():
if not node.is_leaf() and node.support <= 0.75:
node.delete()
print t.get_ascii(attributes=['support', 'name'])
t.write(format=0, outfile=outname)Copy the fasta alignments to the same directory:
cp Chunks/*.fasta Chunks_75python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/ --flend .tre --outf Chunk75_analysis/var-chunks75 --outg alepisaurus_ferox,ceratoscopelus_warmingii
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk75_analysis/bp-chunks75
python src/combine_results.py Chunk75_analysis/var-chunks75 Chunk75_analysis/bp-chunks75 --outf Chunk75_analysis/comb-chunks75
A script was written to delete all lines containing NAs, as well as all lines that only contained an entry for bipartition support but none for root-to-tip branch length variance or tree length. This was accomplished by filling these partially empty lines with NAs in the first step and deleting them in the second step:
combchunks75 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk75_analysis/comb-chunks75", fill = TRUE)
filtered <- na.omit(combchunks75)
write.table(filtered,
"/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk75_analysis/comb-chunks75-filtered",
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE)
The length of the filtered comb file is 901 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 169 trees.
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 0.0202802
Note that this result is orders of magnitude larger than those observed in the other five chunk datasets.
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
uce-323_chunk_2.tre 0.459664 40.9379 0.146551724138
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
## chunk rf
## 284 uce-157_chunk_7.tre 126
## chunk rf
## 194 uce-126_chunk_3.tre 114
## chunk rf
## 391 uce-323_chunk_2.tre 125
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 189 uce-1263_chunk_2.tre 106
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
uce-323_chunk_2.tre 0.459664 40.9379 0.146551724138
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-200_chunk_2.tre 0.0 0.0317025 0.0344827586207
uce-129_chunk_5.tre 0.0 0.0211935 0.0258620689655
uce-1062_chunk_5.tre 0.0 0.113443 0.0258620689655
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0
(The first three steps were identical to those described above.)
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_90/ --flend .tre --outf Chunk90_analysis/var-chunks90 --outg alepisaurus_ferox,ceratoscopelus_warmingii
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_90/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk90_analysis/bp-chunks90
python src/combine_results.py Chunk90_analysis/var-chunks90 Chunk90_analysis/bp-chunks90 --outf Chunk90_analysis/comb-chunks90
Delete the lines with NAs:
combchunks90 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk90_analysis/comb-chunks90", fill = TRUE)
filtered <- na.omit(combchunks90)
write.table(filtered,
"/Users/David/Grive/Alfaro_Lab/SortaDate/comb-chunks90-filtered",
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE)
The length of the filtered comb file is 536 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 534 (almost 50%) trees.
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 8.71304e-07
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-323_chunk_2.tre 0.325194 35.4206 0.0689655172414
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
## chunk rf
## 159 uce-1317_chunk_8.tre 111
## chunk rf
## 81 uce-120_chunk_6.tre 114
## chunk rf
## 248 uce-323_chunk_2.tre 107
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 235 uce-267_chunk_7.tre 106
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-323_chunk_2.tre 0.325194 35.4206 0.0689655172414
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
uce-832_chunk_4.tre 0.0 1.05876 0.0258620689655
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
A bash script used to split the UCE loci into 50-bp chunks was identical to that used above, with the exception of step 3 (joining all taxa represented by a given chunk into a new FASTA file), for which the following code was used:
library(dplyr)
# Alternating rows (name, sequence, name, sequence) go to two different columns, so that
# each sequence is correctly assigned to its respective taxon:
split_seqs <- read.table("split.txt")
odd <- as.vector(split_seqs[seq(1, nrow(split_seqs), 2), ])
even <- as.vector(split_seqs[seq(2, nrow(split_seqs), 2), ])
odd_name <- "taxon"
even_name <- "sequence"
split_seqs_new <- data.frame(odd, even)
names(split_seqs_new) <- c(odd_name, even_name)
# Determine how long the locus is (i.e., how many 50-bp chunks it has been split into).
# This can be done by counting the number of occurrences of a single taxon name.
# In principle, any name could be used, but since no taxon appears to be common to all
# the loci, the code below grabs the first taxon name appearing in the fasta file and
# counts its occurrences.
m <- paste(sapply(strsplit(as.character(split_seqs_new[1,1]), split="_", fixed=TRUE),
function(x) (x[1])),
"_", sapply(strsplit(as.character(split_seqs_new[1,1]), split="_", fixed=TRUE), function(x) (x[2])),
sep = "")
n <- length(unique(grep(m, split_seqs_new[,1], value = TRUE)))
# Create a vector of strings that can filter taxon names according to the base pair range
# tag attached to their end
chunks <- vector()
for(i in 0:(n-1)) {
chunks <- append(chunks, paste("_", i*50, "bp", sep = ""))
}
# Create a list of data frames. Each element of the list represents a base pair range
# and consists of a data frame containing both the "taxon" and "sequence" columns of
# split_seqs_new
partition <- list()
for(i in 1:length(chunks)) {
partition[[i]] <- data.frame(filter(split_seqs_new,
grepl(as.character(chunks[i]), taxon)))
}
# Create a matrix whose columns represents individual chunks (i.e., base pair ranges)
# and whose rows have the structure of the original split_seqs data frame -- i.e., name,
# sequence, name, sequence:
chunkmatrix <- matrix(ncol = length(partition),
nrow = 2*(nrow(split_seqs_new)/length(partition)))
for(i in 1:length(partition)) {
for(j in 1:nrow(partition[[i]])) {
chunkmatrix[(2*j - 1), i] <- as.character(partition[[i]][j, "taxon"])
chunkmatrix[2*j, i] <- as.character(partition[[i]][j, "sequence"])
}
}
# Print the resulting fasta files!
for(i in 1:ncol(chunkmatrix)) {
write(chunkmatrix[,i], paste("chunk_", i, ".fasta", sep = ""))
}
The taxon names in the chunk FASTA files were then stripped of the chunk-indicating suffixes so as to correspond with the names used in the reference tree, and a script was used to analyze all the chunks using RAxML. The resulting trees were then copied into the directory containing the chunks. Out of 6543 chunk FASTA files, only 4237 had tree files associated with them, suggesting that RAxML failed to infer a tree for a given chunk in approx. 35% of cases.
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks/ --flend .tre --outf Min-90-chunk_analysis/var-min-90-chunks --outg alepisaurus_ferox,ceratoscopelus_warmingii
Running the script produced 1029 warnings about either Alepisaurus ferox or Ceratoscopelus warmingii missing from the chunk tree. Despite this, the tree length and root-to-tip variance was computed for all the 4237 trees.
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk_analysis/bp-min-90-chunks
python src/combine_results.py Min-90-chunk_analysis/var-min-90-chunks Min-90-chunk_analysis/bp-min-90-chunks --outf Min-90-chunk_analysis/comb-min-90-chunks
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-1244_chunk_1.tre 0.0534134 11.1586 0.224137931034
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
## chunk rf
## 1229 uce-157_chunk_7.tre 164
## chunk rf
## 822 uce-1244_chunk_1.tre 158
## chunk rf
## 907 uce-126_chunk_3.tre 166
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 6 uce-1000_chunk_6.tre 136
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-1244_chunk_1.tre 0.0534134 11.1586 0.224137931034
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
uce-393_chunk_3.tre 5.54758e-12 9.22463e-05 0.0
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
uce-393_chunk_3.tre 5.54758e-12 9.22463e-05 0.0
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
uce-1173_chunk_3.tre 1.96604e-11 8.52591e-05 0.0
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
uce-1173_chunk_3.tre 1.96604e-11 8.52591e-05 0.0
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 7.89216e-05
## Number of trees after filtering: 4237
## Percentage of trees after filtering: 100%
## Original min. tree length: 7.89216e-05
## Original max. tree length: 214.974
## Min. tree length after step 2: 0.285338
## Max. tree length after step 2: 24.1424
## Number of trees after step 2: 3389
## Percentage of trees after step 2: 79.98584%
## Max. root-to-tip variance from step 2: 2.72349
## Max. root-to-tip variance after step 3: 0.00905244
## Number of trees after step 3: 2270
## Percentage of trees after step 3: 53.57564%
## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.1034483
## Number of trees after step 4: 170
## Percentage of trees after step 4: 4.012273%
The nodes with SH-like support values below 75% were collapsed using the script above. The corresponding FASTA files were copied into the resulting directory, and SortaDate was run as follows:
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_75/ --flend .tre --outf Min-90-chunk75_analysis/var-min-90-chunks75 --outg alepisaurus_ferox,ceratoscopelus_warmingii
Running the script resulted in occasional segmentation faults as well as “this really only works with nexus or newick” warning messages. The resulting file had the full number of lines (4237: one for each tree), but some of them were blank and others contained NAs.
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_75/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk75_analysis/bp-min-90-chunks75
python src/combine_results.py Min-90-chunk75_analysis/var-min-90-chunks75 Min-90-chunk75_analysis/bp-min-90-chunks75 --outf Min-90-chunk75_analysis/comb-min-90-chunks75
Delete the lines with NAs:
comb90chunks75 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk75_analysis/comb-min-90-chunks75", fill = TRUE)
filtered <- na.omit(comb90chunks75)
write.table(filtered,
"/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk75_analysis/comb-min-90-chunks75-filtered",
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE)
nrow(filtered)
The length of the filtered comb file is 3589 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 648 trees.
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-1244_chunk_1.tre 0.040797 10.3002 0.181034482759
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
## chunk rf
## 1064 uce-157_chunk_7.tre 126
## chunk rf
## 710 uce-1244_chunk_1.tre 111
## chunk rf
## 788 uce-126_chunk_3.tre 114
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 2016 uce-463_chunk_2.tre 85
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-1244_chunk_1.tre 0.040797 10.3002 0.181034482759
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-67_chunk_4.tre 0.0 0.0214551 0.0603448275862
uce-200_chunk_2.tre 0.0 0.0317025 0.0344827586207
uce-525_chunk_3.tre 0.0 0.020706 0.0258620689655
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 8.48388e-07
## Number of trees after filtering: 3589
## Percentage of trees after filtering: 100%
## Original min. tree length: 8.48388e-07
## Original max. tree length: 209.395
## Min. tree length after step 2: 0.124261
## Max. tree length after step 2: 19.679
## Number of trees after step 2: 2871
## Percentage of trees after step 2: 79.99443%
## Max. root-to-tip variance from step 2: 6.66986
## Max. root-to-tip variance after step 3: 0.00645959
## Number of trees after step 3: 1923
## Percentage of trees after step 3: 53.58038%
## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.06034483
## Number of trees after step 4: 169
## Percentage of trees after step 4: 4.708833%
python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_90/ --flend .tre --outf Min-90-chunk90_analysis/var-min-90-chunks90 --outg alepisaurus_ferox,ceratoscopelus_warmingii
As in the previous case, the command led to a number of segfaults and “this really only works with nexus or newick” warnings, corresponding to lines in the resulting file that were either incomplete or included NAs.
python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_90/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk90_analysis/bp-min-90-chunks90
python src/combine_results.py Min-90-chunk90_analysis/var-min-90-chunks90 Min-90-chunk90_analysis/bp-min-90-chunks90 --outf Min-90-chunk90_analysis/comb-min-90-chunks90
Delete the lines with NAs:
comb90chunks90 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk90_analysis/comb-min-90-chunks90", fill = TRUE)
filtered <- na.omit(comb90chunks90)
write.table(filtered,
"/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk90_analysis/comb-min-90-chunks90-filtered",
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE)
nrow(filtered)
The length of the filtered comb file is 2319 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 1918 trees.
In order of descending priority: bipartition support, root-to-tip variance, tree length
name root-to-tip_var treelength bipartition
uce-625_chunk_10.tre 0.0675117 0.706842 0.0775862068966
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
## chunk rf
## 1686 uce-625_chunk_10.tre 106
## chunk rf
## 629 uce-1317_chunk_8.tre 111
## chunk rf
## 429 uce-120_chunk_6.tre 114
The chunk with the minimum RF distance from the reference tree:
## chunk rf
## 678 uce-1340_chunk_2.tre 85
In order of descending priority: bipartition support, tree length, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-625_chunk_10.tre 0.0675117 0.706842 0.0775862068966
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
In order of descending priority: root-to-tip variance, tree length, bipartition support
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
In order of descending priority: root-to-tip variance, bipartition support, tree length
name root-to-tip_var treelength bipartition
uce-1189_chunk_2.tre 0.0 1.52836 0.0431034482759
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
uce-67_chunk_4.tre 0.0 0.0214551 0.0344827586207
In order of descending priority: tree length, root-to-tip variance, bipartition support
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
In order of descending priority: tree length, bipartition support, root-to-tip variance
name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
No effect: the minimum observed tree length is non-zero and equal to:
## [1] 8.48388e-07
## Number of trees after filtering: 2319
## Percentage of trees after filtering: 100%
## Original min. tree length: 8.48388e-07
## Original max. tree length: 141.225
## Min. tree length after step 2: 0.111172
## Max. tree length after step 2: 20.8823
## Number of trees after step 2: 1855
## Percentage of trees after step 2: 79.99138%
## Max. root-to-tip variance from step 2: 40.1753
## Max. root-to-tip variance after step 3: 0.00177898
## Number of trees after step 3: 1243
## Percentage of trees after step 3: 53.60069%
## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.03448276
## Number of trees after step 4: 66
## Percentage of trees after step 4: 2.846054%